In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import cufflinks as cf
cf.go_offline()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.python.ops.gen_array_ops import size

from keras.layers.advanced_activations import LeakyReLU

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score


BATH_PATH = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(BATH_PATH, "..", "data")
In [2]:
# Read data
def get_PET_history():
    path = os.path.join(data_path, "PET_data-20210401_20210831.csv")
    PET_data = pd.read_csv(path)
    PET_data["Time"] = pd.to_datetime(PET_data["Time"])
    PET_data.sort_values(by="Time", inplace=True)
    PET_data.set_index("Time", inplace=True)
    PET_data = PET_data.loc[~PET_data.index.duplicated(keep="first")]
    return PET_data


def get_FCU_history():
    path = os.path.join(data_path, "FCU_data-20210201_20210831.csv")
    FCU_data = pd.read_csv(path)
    FCU_data["Time"] = pd.to_datetime(FCU_data["Time"])
    FCU_data.sort_values(by="Time", inplace=True)
    FCU_data.set_index("Time", inplace=True)
    FCU_data = FCU_data.loc[~FCU_data.index.duplicated(keep='first')]
    return FCU_data


def get_AHU_history():
    path = os.path.join(data_path, F"AHU_data-20210201_20210831.csv")
    AHU_data = pd.read_csv(path)
    AHU_data["Time"] = pd.to_datetime(AHU_data["Time"])
    AHU_data.sort_values(by="Time", inplace=True)
    AHU_data.set_index("Time", inplace=True)
    AHU_data = AHU_data.loc[~AHU_data.index.duplicated(keep='first')]
    return AHU_data


def get_CO2_history():
    path = os.path.join(data_path, F"CDS_data-20210201_20210831.csv")
    CDS_data = pd.read_csv(path)
    CDS_data["Time"] = pd.to_datetime(CDS_data["Time"])
    CDS_data.sort_values(by="Time", inplace=True)
    CDS_data.set_index("Time", inplace=True)
    CDS_data = CDS_data.drop("CDS_108_8-CO2", axis=1) # deviation in this sensor data 
    CDS_data = CDS_data.loc[~CDS_data.index.duplicated(keep='first')]
    CDS_data["Floor_CO2_mean"] = CDS_data.mean(axis=1)
    return CDS_data.loc[:, ["Floor_CO2_mean"]]


def get_CWB_history():
    path = os.path.join(data_path, "CWB_data-20210201_20210831.csv")
    CWB_data = pd.read_csv(path)
    CWB_data["Time"] = pd.to_datetime(CWB_data["Time"])
    CWB_data.sort_values(by="Time", inplace=True)
    CWB_data.set_index("Time", inplace=True)
    CWB_data = CWB_data.loc[~CWB_data.index.duplicated(keep='first')]
    return CWB_data


def merge_data():
    CWB_data = get_CWB_history()
    FCU_data = get_FCU_history()
    AHU_data = get_AHU_history()
    CDS_data = get_CO2_history()
    PET_data = get_PET_history()
    ALL_data = pd.concat([CWB_data, FCU_data, AHU_data, CDS_data, PET_data], axis=1, join="outer")
    # Drop insignificant or noisy variable 
    ALL_data = ALL_data.drop(["RH", "GloblRad", "PAH-OA_FLOW", "PAH-FAN_RUN_CMD", "PAH-DMP_POS", "PET_Target"], axis=1)
    return ALL_data.dropna()

display(merge_data())
merge_data().iplot()
Temperature FCU-RA_T FCU-SP FCU-ST FCU-WST PAH-VFD_SPD PAH-ST1 PAH-SA_T Floor_CO2_mean PET_Indoor SP_Target
Time
2021-04-01 00:00:00+08:00 24.500000 22.313543 23.1 0.0 3.0 0.0 0.0 22.400000 481.424007 22.520543 23.0
2021-04-01 00:10:00+08:00 24.483333 22.300000 23.1 0.0 3.0 0.0 0.0 22.400000 480.063036 22.518000 23.2
2021-04-01 00:20:00+08:00 24.466667 22.300000 23.1 0.0 3.0 0.0 0.0 22.400000 478.947236 22.517000 23.2
2021-04-01 00:30:00+08:00 24.450000 22.280525 23.1 0.0 3.0 0.0 0.0 22.400000 477.952797 22.501525 23.2
2021-04-01 00:40:00+08:00 24.433333 22.200000 23.1 0.0 3.0 0.0 0.0 22.400000 477.826316 22.444000 23.3
... ... ... ... ... ... ... ... ... ... ... ...
2021-08-31 23:10:00+08:00 29.350000 23.697488 25.0 0.0 3.0 0.0 0.0 24.000000 427.503254 23.921000 25.0
2021-08-31 23:20:00+08:00 29.300000 23.600000 25.0 0.0 3.0 0.0 0.0 24.068112 427.301354 23.920000 25.0
2021-08-31 23:30:00+08:00 29.250000 23.600000 25.0 0.0 3.0 0.0 0.0 24.149771 426.716231 23.920000 25.0
2021-08-31 23:40:00+08:00 29.200000 23.567579 25.0 0.0 3.0 0.0 0.0 24.254774 426.336100 23.920000 25.0
2021-08-31 23:50:00+08:00 29.150000 23.500000 25.0 0.0 3.0 0.0 0.0 24.300000 426.310519 23.919000 25.0

21960 rows × 11 columns

In [3]:
# Create timeseries features (here is just the sample not the real structure in the field)
def get_train_feature(target, future_step=1):
    # Preparing features (can also add some features if you'd like to try other models)
    data = merge_data()
    
    # data.insert(loc=0, column="DoY", value=(data.index.dayofyear).values) # seems not significant
    data.insert(loc=1, column="Hour", value=(data.index.hour).values + (pd.Series(data.index).dt.minute).values/60)
    data.insert(loc=2, column="DoW", value=(data.index.dayofweek).values)
    data.insert(loc=3, column="HoW", value=(data["Hour"] + 24 * data.index.dayofweek).values)

    # features set (add lag features for future prediction: the [t-1] situation leads to [t] outcome)
    for col in data.loc[:, ~data.columns.str.contains(target)].columns:
        data.loc[:, F"{col}-lag"] = data.loc[:, col].shift(future_step)
    
    # target can also shift as lag features (but seems not significant)
    # data.loc[:, F"{target}-lag"] = data.loc[:, target].shift(future_step)

    return data.dropna()

# Scale data may get some help
def data_scale(dataset, scale_type="MinMax"):
    if scale_type =="MinMax":
        data_min = dataset.min()
        data_max = dataset.max()
        scaled_df = (dataset - data_min) / (data_max - data_min)
        pars = {"data_min": data_min, "data_max": data_max}
        return scaled_df, pars
    if scale_type =="normalize":
        data_mean = dataset.mean()
        data_std = dataset.std()
        normalized_df = (dataset - data_mean) / data_std
        pars = {"data_mean": data_mean, "data_std": data_std}
        return normalized_df, pars

# Split train, valid and test data
def generator(data, start, end):
    X = data.loc[start:end, ~data.columns.str.contains(target)]
    Y = data.loc[start:end, data.columns.str.contains(target)]
    return X, Y

target = "SP_Target"
future_step = 6 * 1 # timestep is 10 min, 6 timestep for 1 hr lag-feature

# data = get_train_feature(target, future_step) # can also try training data without scaled
data = data_scale(get_train_feature(target, future_step))[0]
pars = data_scale(get_train_feature(target, future_step))[1]

# train data
start = "2021-04-01"
end = "2021-06-30"
_train = generator(data, start, end)
train_x = _train[0]
train_y = _train[1]

# valid data
start = "2021-05-31"
end = "2021-07-31"
_val = generator(data, start, end)
val_x = _val[0]
val_y = _val[1]

# test data
start = "2021-06-30"
end = "2021-08-31"
_test = generator(data, start, end)
test_x = _test[0]
test_y = _test[1]

# all data
start = "2021-04-01"
end = "2021-08-31"
_all = generator(data, start, end)
all_x = _all[0]
all_y = _all[1]

display(all_x)
display(all_y)
Temperature Hour DoW HoW FCU-RA_T FCU-SP FCU-ST FCU-WST PAH-VFD_SPD PAH-ST1 ... HoW-lag FCU-RA_T-lag FCU-SP-lag FCU-ST-lag FCU-WST-lag PAH-VFD_SPD-lag PAH-ST1-lag PAH-SA_T-lag Floor_CO2_mean-lag PET_Indoor-lag
Time
2021-04-01 01:00:00+08:00 0.383178 0.041958 0.500000 0.434955 0.145985 0.183333 0.0 1.0 0.0 0.0 ... 0.428997 0.166704 0.183333 0.0 1.0 0.0 0.0 0.664688 0.708873 0.522040
2021-04-01 01:10:00+08:00 0.383956 0.048951 0.500000 0.435948 0.145985 0.183333 0.0 1.0 0.0 0.0 ... 0.429990 0.164233 0.183333 0.0 1.0 0.0 0.0 0.664688 0.706869 0.521720
2021-04-01 01:20:00+08:00 0.384735 0.055944 0.500000 0.436941 0.145985 0.183333 0.0 1.0 0.0 0.0 ... 0.430983 0.164233 0.183333 0.0 1.0 0.0 0.0 0.664688 0.705226 0.521594
2021-04-01 01:30:00+08:00 0.385514 0.062937 0.500000 0.437934 0.145985 0.183333 0.0 1.0 0.0 0.0 ... 0.431976 0.160679 0.183333 0.0 1.0 0.0 0.0 0.664688 0.703762 0.519643
2021-04-01 01:40:00+08:00 0.386293 0.069930 0.500000 0.438928 0.145985 0.183333 0.0 1.0 0.0 0.0 ... 0.432969 0.145985 0.183333 0.0 1.0 0.0 0.0 0.664688 0.703576 0.512391
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-08-31 23:10:00+08:00 0.614486 0.972028 0.166667 0.281033 0.419247 0.500000 0.0 1.0 0.0 0.0 ... 0.275074 0.528267 0.500000 0.0 1.0 0.0 0.0 0.685993 0.631985 0.734818
2021-08-31 23:20:00+08:00 0.612150 0.979021 0.166667 0.282026 0.401458 0.500000 0.0 1.0 0.0 0.0 ... 0.276068 0.499538 0.500000 0.0 1.0 0.0 0.0 0.693065 0.631117 0.719315
2021-08-31 23:30:00+08:00 0.609813 0.986014 0.166667 0.283019 0.401458 0.500000 0.0 1.0 0.0 0.0 ... 0.277061 0.476726 0.500000 0.0 1.0 0.0 0.0 0.697126 0.630624 0.708473
2021-08-31 23:40:00+08:00 0.607477 0.993007 0.166667 0.284012 0.395541 0.500000 0.0 1.0 0.0 0.0 ... 0.278054 0.456350 0.500000 0.0 1.0 0.0 0.0 0.702158 0.630276 0.698809
2021-08-31 23:50:00+08:00 0.605140 1.000000 0.166667 0.285005 0.383209 0.500000 0.0 1.0 0.0 0.0 ... 0.279047 0.440235 0.500000 0.0 1.0 0.0 0.0 0.706231 0.630140 0.698833

21954 rows × 26 columns

SP_Target
Time
2021-04-01 01:00:00+08:00 0.366667
2021-04-01 01:10:00+08:00 0.366667
2021-04-01 01:20:00+08:00 0.366667
2021-04-01 01:30:00+08:00 0.366667
2021-04-01 01:40:00+08:00 0.366667
... ...
2021-08-31 23:10:00+08:00 0.666667
2021-08-31 23:20:00+08:00 0.666667
2021-08-31 23:30:00+08:00 0.666667
2021-08-31 23:40:00+08:00 0.666667
2021-08-31 23:50:00+08:00 0.666667

21954 rows × 1 columns

In [4]:
# Model train (the model here is one of the optimized sample)
model_summary = pd.DataFrame()
Epochs = 30
Batch = 6 * 24 * 7

tf.random.set_seed(221)
model = Sequential()
model.add(Dense(units=8, input_dim=train_x.shape[1], activation="relu"))
model.add(Dense(units=16, activation="selu"))
model.add(Dense(units=32, activation=LeakyReLU()))
model.add(Dense(units=16, activation="selu"))
model.add(Dense(units=8, activation="relu"))
model.add(Dense(units=1))

opt = keras.optimizers.Adam()
model.compile(loss="mae", optimizer=opt, metrics=["mse"])
model.fit(train_x, train_y, epochs=Epochs, batch_size=Batch, validation_data=(val_x, val_y))

y_pred = model.predict(test_x)
test_y["pred"] = y_pred[:, 0]
test_y[target] = test_y[target]
test_y["pred"] = test_y["pred"]

fig = go.Figure()
fig.add_trace(go.Scatter(name="real", x=test_y.index, y=test_y[target]))
fig.add_trace(go.Scatter(name="pred", x=test_y.index, y=test_y["pred"]))
fig.show()

test_y = test_y.dropna()

print("r2: ", round(r2_score(test_y[target], test_y["pred"]), 2))
print("MAE: ", round(mae(test_y[target], test_y["pred"]), 2))
print("RMSE: ", round(mse(test_y[target], test_y["pred"]), 2))

model.summary()
Epoch 1/30
13/13 [==============================] - 1s 14ms/step - loss: 0.1130 - mse: 0.0227 - val_loss: 0.0777 - val_mse: 0.0092
Epoch 2/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0899 - mse: 0.0159 - val_loss: 0.0545 - val_mse: 0.0066
Epoch 3/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0819 - mse: 0.0144 - val_loss: 0.0505 - val_mse: 0.0059
Epoch 4/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0765 - mse: 0.0133 - val_loss: 0.0484 - val_mse: 0.0056
Epoch 5/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0722 - mse: 0.0122 - val_loss: 0.0471 - val_mse: 0.0052
Epoch 6/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0685 - mse: 0.0113 - val_loss: 0.0462 - val_mse: 0.0048
Epoch 7/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0656 - mse: 0.0106 - val_loss: 0.0441 - val_mse: 0.0044
Epoch 8/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0633 - mse: 0.0101 - val_loss: 0.0424 - val_mse: 0.0041
Epoch 9/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0618 - mse: 0.0099 - val_loss: 0.0411 - val_mse: 0.0038
Epoch 10/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0607 - mse: 0.0095 - val_loss: 0.0401 - val_mse: 0.0037
Epoch 11/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0598 - mse: 0.0094 - val_loss: 0.0399 - val_mse: 0.0036
Epoch 12/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0589 - mse: 0.0094 - val_loss: 0.0397 - val_mse: 0.0036
Epoch 13/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0580 - mse: 0.0092 - val_loss: 0.0375 - val_mse: 0.0033
Epoch 14/30
13/13 [==============================] - 0s 4ms/step - loss: 0.0569 - mse: 0.0091 - val_loss: 0.0372 - val_mse: 0.0032
Epoch 15/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0564 - mse: 0.0090 - val_loss: 0.0369 - val_mse: 0.0030
Epoch 16/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0562 - mse: 0.0089 - val_loss: 0.0361 - val_mse: 0.0031
Epoch 17/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0554 - mse: 0.0088 - val_loss: 0.0348 - val_mse: 0.0029
Epoch 18/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0553 - mse: 0.0088 - val_loss: 0.0342 - val_mse: 0.0028
Epoch 19/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0546 - mse: 0.0087 - val_loss: 0.0350 - val_mse: 0.0029
Epoch 20/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0545 - mse: 0.0087 - val_loss: 0.0329 - val_mse: 0.0027
Epoch 21/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0537 - mse: 0.0086 - val_loss: 0.0337 - val_mse: 0.0027
Epoch 22/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0533 - mse: 0.0086 - val_loss: 0.0327 - val_mse: 0.0027
Epoch 23/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0529 - mse: 0.0085 - val_loss: 0.0339 - val_mse: 0.0027
Epoch 24/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0531 - mse: 0.0085 - val_loss: 0.0318 - val_mse: 0.0026
Epoch 25/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0525 - mse: 0.0084 - val_loss: 0.0314 - val_mse: 0.0026
Epoch 26/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0527 - mse: 0.0084 - val_loss: 0.0311 - val_mse: 0.0026
Epoch 27/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0518 - mse: 0.0083 - val_loss: 0.0303 - val_mse: 0.0025
Epoch 28/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0530 - mse: 0.0083 - val_loss: 0.0341 - val_mse: 0.0026
Epoch 29/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0532 - mse: 0.0082 - val_loss: 0.0308 - val_mse: 0.0025
Epoch 30/30
13/13 [==============================] - 0s 3ms/step - loss: 0.0509 - mse: 0.0081 - val_loss: 0.0298 - val_mse: 0.0024
r2:  0.64
MAE:  0.04
RMSE:  0.0
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 8)                 216       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_3 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
=================================================================
Total params: 1,577
Trainable params: 1,577
Non-trainable params: 0
_________________________________________________________________
In [5]:
# Re-scaled the prediction
plot_df = test_y * (pars["data_max"][target] - pars["data_min"][target]) + pars["data_min"][target]
plot_df.iplot(title=F"{target}-real vs prediction")
In [6]:
# this model only have to be precise at the time we need
# for such a special application, here can use schedule rules to optimize predictive strategy
plot_df["pred"] = np.where(np.logical_or(plot_df.index.dayofweek>4, np.logical_or(plot_df.index.hour<7, plot_df.index.hour>21)), 25, plot_df["pred"])
plot_df.iplot(title=F"{target}-real vs prediction")

# the control strategy of HVAC Setpoint temperature assure the indoor thermal comfort
print("r2: ", round(r2_score(plot_df[target], plot_df["pred"]), 2))
print("MAE: ", round(mae(plot_df[target], plot_df["pred"]), 2))
print("RMSE: ", round(mse(plot_df[target], plot_df["pred"]), 2))
r2:  0.84
MAE:  0.13
RMSE:  0.07